#!/usr/bin/env python 
# -*- coding: utf-8 -*-

"""
@author: zyx
@since: 2022/2/7 10:27
@file: kuaidaili_crawler.py
"""

# url：https://www.kuaidaili.com/free
# 需求：将前5页的所有id和port解析且存储到文件中

# https://www.kuaidaili.com/free/inha/i/
import time

import requests
from bs4 import BeautifulSoup

# Base listing URL; page number is appended (e.g. .../inha/1).
url = 'https://www.kuaidaili.com/free/inha/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

# Destination for the scraped proxies (the header comment requires the
# results to be stored in a file, not just printed).
OUTPUT_FILE = 'proxies.txt'


def parse_proxies(page_text):
    """Extract (ip, port) string pairs from one listing page's HTML.

    Rows whose IP or PORT cell is missing are skipped instead of raising
    AttributeError (the original code would crash on a malformed row).
    """
    soup = BeautifulSoup(page_text, 'lxml')
    proxies = []
    for tr in soup.select('tbody > tr'):
        ip_td = tr.find(name='td', attrs={'data-title': 'IP'})
        port_td = tr.find(name='td', attrs={'data-title': 'PORT'})
        if ip_td is not None and port_td is not None:
            proxies.append((ip_td.get_text(), port_td.get_text()))
    return proxies


def main(pages=5, output_file=OUTPUT_FILE):
    """Crawl the first `pages` listing pages and store ip:port lines to a file.

    Each pair is also printed, preserving the original console output.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(1, pages + 1):
            print(f'===>>> 开始爬取第{i}页数据')
            # timeout prevents an indefinite hang; raise_for_status fails
            # loudly instead of silently parsing an HTTP error page.
            response = requests.get(url + str(i), headers=headers, timeout=10)
            response.raise_for_status()
            for ip, port in parse_proxies(response.text):
                print((ip, port))
                f.write(f'{ip}:{port}\n')
            print(f'===>>> 成功爬取第{i}页数据')

            # Polite delay between page requests to avoid hammering the site.
            time.sleep(2)


if __name__ == '__main__':
    main()